#Loading multiple .csv files as separate data frames
getwd()
folder <- "data/"
file_list <- list.files(path = folder, pattern = "*.csv")#Read in each .csv file
for (i in 1:length(file_list)){
assign(file_list[i],
read.csv(paste(folder,file_list[i],sep=''),stringsAsFactors = F)
)}#Rename data
geo <- olist_geolocation_dataset.csv
orders <- olist_orders_dataset.csv
cust <- olist_customers_dataset.csv
sellers <- olist_sellers_dataset.csv
products <- olist_products_dataset.csv
orderitems <- olist_order_items_dataset.csv
payments <- olist_order_payments_dataset.csv
nametrans <- product_category_name_translation.csv
reviews <- olist_order_reviews_dataset.csv
closed <- olist_closed_deals_dataset.csv
marketing <- olist_marketing_qualified_leads_dataset.csv# 地理資料整理
geo$geolocation_lat<-round(geo$geolocation_lat,3)
geo$geolocation_lng<-round(geo$geolocation_lng,3)
selllocation<-geo %>% group_by(geolocation_city) %>% summarise(selllat = max(geolocation_lat),selllng=max(geolocation_lng))
custlocation<-geo %>% group_by(geolocation_city) %>% summarise(custlat = max(geolocation_lat),custlng=max(geolocation_lng))# 時間資料處理
orders$order_approved_at<-as.Date(orders$order_approved_at,format="%Y-%m-%d %H:%M:%S")
orders$order_purchase_timestamp<-as.Date(orders$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
orders$order_delivered_carrier_date<-as.Date(orders$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
orders$order_delivered_customer_date<-as.Date(orders$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
orders$order_estimated_delivery_date<-as.Date(orders$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")
orderitems$shipping_limit_date<-as.Date(orderitems$shipping_limit_date,format="%Y-%m-%d %H:%M:%S")# 把各個資料合併
M_1 <- merge(orderitems,sellers,by.x="seller_id",by.y="seller_id")
M_2 <- merge(orders,cust,by.x="customer_id",by.y="customer_id")
M_3 <- merge(M_2,M_1,by="order_id")
M_4 <- merge(M_3,products,by="product_id")
M_5 <- merge(M_4,payments,by="order_id")
M_6 <- merge(M_5,selllocation,by.x="seller_city",by.y="geolocation_city")
M_7 <- merge(M_6,custlocation,by.x="customer_city",by.y="geolocation_city")
colnames(nametrans) <- c("product_category_name","product_category_name_english")#計算買賣家之間的距離
dist_list <- list()
for (i in 1:nrow(M_7)) {
dist_list[[i]] <- gdist(lon.1 = M_7$selllng[i],
lat.1 = M_7$selllat[i],
lon.2 = M_7$custlng[i],
lat.2 = M_7$custlat[i],
units="miles")
}
M_7$distbtwn<-as.integer(dist_list)
M_8<-merge(M_7,nametrans,by="product_category_name")1-1.訂單數量
ts = as.POSIXct(as.character(olist_orders_dataset.csv$order_purchase_timestamp) , format="%Y-%m-%d %T")
ts.bym <- cut(ts, breaks = "month")
dfts <- data.frame(ts,ts.bym)
dim(dfts) #共有99441筆訂單## [1] 99441 2
ggplot(dfts, aes(ts.bym))+
geom_bar(fill="lightblue")+
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5))+
ylab("Count") +
xlab("Month") +
theme(panel.background = element_rect(fill = "#f1f1f1")) #最多銷售月份為2017-11-1ts.byH <- format(ts,format="%H") %>% data.frame()
ggplot(ts.byH ,aes(.))+
geom_bar(fill="lightblue")+
ylab("Count") +
xlab("Time") +
theme(panel.background = element_rect(fill = "#f1f1f1")) #中午到傍晚時段為下單尖峰##按照小時區分
DD.byh <- format(DD,format="%H")
D$byh <- DD.byh
#Rearranging data from D data frame
deaf = select(D, product_category_name_english,byh)
shook = group_by(deaf, byh, product_category_name_english) %>% summarise(n=n())
jump = spread(shook, byh, n)
jump[is.na(jump)] <- 0
#Converting product category column into rowname
jump = column_to_rownames(jump, loc = "product_category_name_english")
jump = as.matrix(jump)
#Using plot_ly () to draw the interactive heatmap
plot_ly(x=colnames(jump), y=rownames(jump), z = jump, type = "heatmap")1-2.賣家數量及地理分布
## [1] 2962
ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
geom_point(data= M_8,aes(x=selllng,y=selllat,color=seller_state),size=0.2)1-3.買家數量及地理分布
## [1] 92458
ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
geom_point(data= M_8,aes(x=custlng,y=custlat,color=customer_state),size=0.2)Q <- group_by(olist_customers_dataset.csv, customer_unique_id) %>% summarise(nid=n())
dim(Q) #共有96096名顧客## [1] 96096 2
##
## 1 2 3 4 5 6 7 9 17
## 0.969 0.029 0.002 0.000 0.000 0.000 0.000 0.000 0.000
##
## housewares computers_accessories furniture_decor
## 6527 7275 7513
## sports_leisure health_beauty bed_bath_table
## 7954 9103 10001
health_beauty <- filter(M_14_4, product_category_name_english=="health_beauty")
ggplot(health_beauty, aes(x=customer_state, y=price,col="grey")) +
geom_point(color="orange", size=4) +
geom_segment( aes(x=customer_state, xend=customer_state, y=0, yend=price))+theme_light() +
theme(
panel.grid.major.x = element_blank(),
panel.border = element_blank(),
axis.ticks.x = element_blank()
)1-4.產品類別銷售及地理區域市佔情形
ggplot() +
geom_bar(data= M_8,aes(product_category_name_english,fill=seller_state),width=1)+
coord_flip()+
theme(axis.text.y = element_text( size=5))+
ylab("商品類別") +
xlab("數量") Product2 <- group_by(M_15_3,business_segment)%>%summarize(score2=round(mean(review_score),2),percent=n()/nrow(M_15_3)*100)
plot_ly(Product2, labels = paste(Product2$business_segment,Product2$score2),values = Product2$percent, type = 'pie') %>%
layout(title = '各商品類別銷售百分比',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))2-1.每月的訂單數,新進的買、賣家數趨勢變化
marketingdata <- merge(closed[,c(1,2,6,8,9,12)],marketing[,c(1,4)],by = "mql_id")
M_9 <- merge(M_8,marketingdata,by = "seller_id",all.x = T)M_9[,9]=as.POSIXct(M_9[,9], format="%Y-%m-%d")
M_9$time_group <- format(M_9$order_purchase_timestamp,"%Y%m") %>% as.numeric()
M_9$time <- format(M_9$order_purchase_timestamp,"%Y%m") %>% paste0(.,"28") %>% as.Date("%Y%m%d")
time_group <- unique(M_9$time_group) %>% sort()
num_seller_1 = sapply(1:length(time_group), # start by 2, so i-1 = 1
function(i) setdiff(M_9$seller_id[M_9$time_group==time_group[i]],unique(M_9$seller_id[M_9$time_group<time_group[i]])) %>% length)
num_seller = c(0,num_seller_1[-24])
num_customer_1 = sapply(1:length(time_group), # start by 2, so i-1 = 1
function(i) setdiff(M_9$customer_unique_id[M_9$time_group==time_group[i]],unique(M_9$customer_unique_id[M_9$time_group<time_group[i]])) %>% length)
num_customer = c(0,num_customer_1[-24])
num_order <- M_9 %>% group_by(time) %>% summarise(
num_order = length(unique(order_id))
)
plot <- cbind(time=(sort(unique(M_9$time))),num_seller,num_customer,num_order[,2])data.ts<-zoo(plot,plot[,"time"])
plot <- data.ts[1:24,-1]
dygraph(plot,main = "Olist新進買賣家及訂單數走勢") %>%
dySeries("num_customer", label = "新進顧客數")%>%
dySeries("num_order", label = "訂單數") %>%
dySeries("num_seller", axis = 'y2', label = "新進賣家數") %>%
dyOptions( axisLineColor="orange",
gridLineColor="indianred" , fillGraph = F,fillAlpha = 0.2,
drawGrid = TRUE,drawPoints=TRUE, pointSize = 1 ) %>%
dyAxis("x", label = " 日期 ", drawGrid = F) %>%
dyAxis("y", label = " ", drawGrid = T) %>%
dyHighlight(highlightCircleSize = 3,
highlightSeriesBackgroundAlpha = 0.2) %>%
dyOptions(colors = RColorBrewer::brewer.pal(3, 'Dark2')) %>%
dyRangeSelector(height = 1)2-2.泡泡圖
3-1.地理區域和評分與銷售額的互動圖
#數據定義同公共部分
reviews <-olist_order_reviews_dataset.csv
orderitms <- olist_order_items_dataset.csv
sellers <- olist_sellers_dataset.csv
#A <- review of orderitems
A<-merge(x = reviews , y = orderitems, by = "order_id")
#E <- review of sellers with order information
E<-merge(x = A , y = sellers, by = "seller_id")#score: the average score of each city
score <- E %>%
group_by(seller_city) %>%
summarise(
score = mean(review_score),
pricesum = sum(price)+sum(freight_value)
) #geolocation: private dataframe, with state,lon and lat of each city
geolocation <- olist_geolocation_dataset.csv
geolocation['seller_city'] = geo['geolocation_city']
geolocation['state'] = geo["geolocation_state"]
geolocation['geolocation_zip_code_prefix'] = NULL
geolocation <- geo %>%
group_by(geolocation_city) %>%
summarise(
lat = mean(geolocation_lat),
lng = mean(geolocation_lng),
seller_city = geolocation_city[1],
state = geolocation_state[1]
)
#draw review of each state and use addPolygon to fig
city_score <- merge(score,geolocation,by="seller_city")
state_score <- city_score %>%
group_by(state) %>%
summarise(
score = mean(score),
pricesum = sum(pricesum)
)
#import Brazill json data
states <- geojsonio::geojson_read("C:\\Users\\User\\Desktop\\Group3\\midterm\\Brazil.json", what = "sp")
#color setting
bins_score <- c(0,1.0,2.0,3.0,4.0,5.0)
bins_money <- c(0,500,5000,10000,50000,100000,1000000,Inf)
pal_score <- colorBin("Blues", domain = state_score$score, bins = bins_score)
pal_money <- colorBin("Reds" ,domain = state_score$pricesum, bins = bins_money)
#add score Info of State
labels <- sprintf(
"<strong>%s</strong><br/> score: %g <br/> total price: %g ",
state_score$state,
state_score$score,
state_score$pricesum
) %>% lapply(htmltools::HTML)
citylabels <- sprintf(
"<strong>city: %s</strong><br/> score: %g <br/> total price: %g ",
city_score$seller_city,
city_score$score,
city_score$pricesum
)%>% lapply(htmltools::HTML)
#form a map of Brazil
map_Brazil <- leaflet(states) %>%
setView(lat=-22.074022, lng=-48.74026, zoom = 4) %>%
addProviderTiles("MapBox",group = "map", options = providerTileOptions(
id = "mapbox.light",
accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN')))
#draw city inform
map_Brazil <- map_Brazil %>%
addProviderTiles("MapBox",group = "city", options = providerTileOptions(
id = "mapbox.light",
accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) %>%
addMarkers(
lng = city_score$lng,
lat = city_score$lat,
label = citylabels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 4px"),
textsize = "15px",
direction = "auto"),
clusterOptions = markerClusterOptions(),
options = popupOptions(closeButton = TRUE),
group = "city"
) #draw state labels
map_Brazil <- map_Brazil %>%
#add state score information
addPolygons(
fillColor = ~pal_score(state_score$score),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"),
group = "state_score"
) %>%
#add state price information
addPolygons(
fillColor = ~pal_money(state_score$pricesum),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"),
group = "state_price"
) %>%
#add legends
addLegend(pal = pal_money, values = ~state_score$pricesum ,opacity =0.7,title = "price legend",
position = "bottomleft",group = "state_price" ) %>%
addLegend(pal = pal_score, values = ~state_score$score, opacity = 0.7, title = "score legend",
position = "bottomleft",group = "state_score") %>%
# add layer Control
addLayersControl(
overlayGroups = c("city","state_score","state_price")
) %>%
hideGroup("state_price") %>%
# add mini map
addProviderTiles(providers$Esri.WorldStreetMap) %>%
addMiniMap(
tiles = providers$Esri.WorldStreetMap,
toggleDisplay = TRUE)4-1.對評分的回歸模型
score <- M_9[,c(5,10:12,16,17,19,20,22,24:29,32,38,46)] %>% group_by(order_id) %>% summarise(
ship13 = mean(order_delivered_customer_date-order_approved_at),
ship12 = mean(order_delivered_carrier_date-order_approved_at),
ship23 = mean(order_delivered_customer_date-order_delivered_carrier_date),
ship_ratio = mean(freight_value/price),
price = sum(price),
AVEvolume = sum(product_length_cm*product_height_cm*product_width_cm)/max(order_item_id),
description = sum(product_description_lenght)/max(order_item_id),
photo_num = sum(product_photos_qty)/max(order_item_id),
pay_installment = sum(payment_installments)/max(order_item_id),
dist = sum(distbtwn)/max(order_item_id),
customer_state = customer_state[1],
seller_state = seller_state[1],
time_group = time_group[1]
)
reviews2 <- reviews %>% group_by(order_id) %>% summarise(
score = mean(review_score)
)score_lm<-merge(score,reviews2,by="order_id")
score_lm[,2:4] <- sapply(score_lm[,2:4], as.numeric)
score_lm <- score_lm[complete.cases(score_lm),]##
## Call:
## lm(formula = score ~ ., data = score_lm[, c(2:15)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1415 -0.4159 0.5115 0.7576 8.7138
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.467e+02 1.667e+01 8.797 < 2e-16 ***
## ship13 -4.935e-02 5.077e-04 -97.207 < 2e-16 ***
## ship12 -5.992e-03 1.248e-03 -4.800 1.59e-06 ***
## ship23 NA NA NA NA
## ship_ratio -1.026e-01 1.367e-02 -7.507 6.11e-14 ***
## price -1.803e-04 1.989e-05 -9.068 < 2e-16 ***
## AVEvolume 5.964e-07 1.449e-07 4.117 3.85e-05 ***
## description 1.746e-05 5.592e-06 3.122 0.00180 **
## photo_num 3.931e-04 2.038e-03 0.193 0.84703
## pay_installment -5.247e-03 1.555e-03 -3.374 0.00074 ***
## dist 2.896e-05 1.092e-05 2.653 0.00797 **
## customer_stateAL -3.049e-02 1.519e-01 -0.201 0.84094
## customer_stateAM 4.792e-01 1.722e-01 2.783 0.00539 **
## customer_stateAP 4.781e-01 2.030e-01 2.355 0.01853 *
## customer_stateBA -2.006e-01 1.407e-01 -1.426 0.15385
## customer_stateCE -8.222e-02 1.431e-01 -0.574 0.56564
## customer_stateDF -3.121e-01 1.417e-01 -2.203 0.02757 *
## customer_stateES -2.290e-01 1.419e-01 -1.614 0.10651
## customer_stateGO -2.120e-01 1.419e-01 -1.493 0.13534
## customer_stateMA -1.838e-01 1.464e-01 -1.256 0.20923
## customer_stateMG -3.013e-01 1.398e-01 -2.156 0.03113 *
## customer_stateMS -1.398e-01 1.465e-01 -0.954 0.33986
## customer_stateMT -4.708e-02 1.452e-01 -0.324 0.74583
## customer_statePA -5.394e-03 1.447e-01 -0.037 0.97025
## customer_statePB -1.621e-02 1.490e-01 -0.109 0.91339
## customer_statePE -8.902e-02 1.423e-01 -0.626 0.53158
## customer_statePI -1.089e-01 1.500e-01 -0.726 0.46804
## customer_statePR -2.555e-01 1.404e-01 -1.819 0.06889 .
## customer_stateRJ -3.674e-01 1.397e-01 -2.629 0.00856 **
## customer_stateRN 2.469e-02 1.502e-01 0.164 0.86945
## customer_stateRO 7.345e-02 1.600e-01 0.459 0.64622
## customer_stateRR 3.445e-01 2.407e-01 1.431 0.15239
## customer_stateRS -1.548e-01 1.402e-01 -1.104 0.26964
## customer_stateSC -2.168e-01 1.408e-01 -1.540 0.12359
## customer_stateSE -1.144e-01 1.543e-01 -0.741 0.45855
## customer_stateSP -3.909e-01 1.396e-01 -2.799 0.00512 **
## customer_stateTO -3.208e-02 1.575e-01 -0.204 0.83863
## seller_stateBA 3.103e-01 7.018e-01 0.442 0.65835
## seller_stateCE 5.480e-01 7.122e-01 0.769 0.44161
## seller_stateDF 1.547e-01 7.011e-01 0.221 0.82532
## seller_stateES 2.156e-01 7.033e-01 0.307 0.75922
## seller_stateGO 4.409e-01 7.021e-01 0.628 0.53003
## seller_stateMA 3.084e-01 7.025e-01 0.439 0.66065
## seller_stateMG 3.030e-01 6.999e-01 0.433 0.66504
## seller_stateMS 4.792e-01 7.227e-01 0.663 0.50731
## seller_stateMT 3.493e-01 7.075e-01 0.494 0.62158
## seller_statePA 6.677e-01 8.204e-01 0.814 0.41572
## seller_statePB 2.700e-01 7.339e-01 0.368 0.71299
## seller_statePE 2.939e-01 7.026e-01 0.418 0.67578
## seller_statePI 4.783e-01 7.894e-01 0.606 0.54454
## seller_statePR 3.045e-01 6.999e-01 0.435 0.66350
## seller_stateRJ 2.887e-01 7.001e-01 0.412 0.68003
## seller_stateRN 1.840e-01 7.205e-01 0.255 0.79847
## seller_stateRO 4.848e-02 8.568e-01 0.057 0.95487
## seller_stateRS 3.207e-01 7.003e-01 0.458 0.64697
## seller_stateSC 3.108e-01 7.001e-01 0.444 0.65708
## seller_stateSE -3.641e-01 8.204e-01 -0.444 0.65721
## seller_stateSP 1.824e-01 6.998e-01 0.261 0.79432
## time_group -7.028e-04 8.259e-05 -8.510 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.211 on 93393 degrees of freedom
## Multiple R-squared: 0.1247, Adjusted R-squared: 0.1242
## F-statistic: 233.4 on 57 and 93393 DF, p-value: < 2.2e-16
4-2.相關係數
## ship13 ship12 ship23 ship_ratio
## ship13 1.000000000 0.386467622 0.928326482 0.07662941
## ship12 0.386467622 1.000000000 0.015887260 -0.02122266
## ship23 0.928326482 0.015887260 1.000000000 0.09162889
## ship_ratio 0.076629410 -0.021222657 0.091628893 1.00000000
## price 0.048197974 0.066818713 0.025318109 -0.27239650
## AVEvolume 0.062146732 0.118137527 0.019754219 -0.08065473
## description 0.005079196 0.005716356 0.003202211 -0.11409592
## photo_num -0.022481239 -0.035293149 -0.010145917 -0.04701655
## pay_installment 0.074208135 0.057819481 0.057143319 -0.18573847
## dist 0.267048036 0.008095524 0.286245083 0.14047168
## time_group -0.059746756 -0.080666577 -0.032256348 0.00938864
## score -0.336192046 -0.154493835 -0.302193607 -0.02709736
## price AVEvolume description photo_num
## ship13 0.048197974 0.06214673 0.005079196 -0.02248124
## ship12 0.066818713 0.11813753 0.005716356 -0.03529315
## ship23 0.025318109 0.01975422 0.003202211 -0.01014592
## ship_ratio -0.272396502 -0.08065473 -0.114095916 -0.04701655
## price 1.000000000 0.30940086 0.267643426 0.11663583
## AVEvolume 0.309400865 1.00000000 0.115844242 0.10944865
## description 0.267643426 0.11584424 1.000000000 0.22384301
## photo_num 0.116635834 0.10944865 0.223843009 1.00000000
## pay_installment 0.316193547 0.17100803 0.074115328 0.03766222
## dist 0.112570130 0.05975062 0.139228914 0.11028820
## time_group -0.004081408 -0.03937762 0.022193502 -0.01354552
## score -0.031142695 -0.01878400 0.011170128 0.01252416
## pay_installment dist time_group score
## ship13 0.07420813 0.267048036 -0.059746756 -0.336192046
## ship12 0.05781948 0.008095524 -0.080666577 -0.154493835
## ship23 0.05714332 0.286245083 -0.032256348 -0.302193607
## ship_ratio -0.18573847 0.140471682 0.009388640 -0.027097359
## price 0.31619355 0.112570130 -0.004081408 -0.031142695
## AVEvolume 0.17100803 0.059750624 -0.039377624 -0.018784003
## description 0.07411533 0.139228914 0.022193502 0.011170128
## photo_num 0.03766222 0.110288200 -0.013545517 0.012524155
## pay_installment 1.00000000 0.087421435 -0.060023291 -0.030452988
## dist 0.08742144 1.000000000 -0.035967577 -0.038081568
## time_group -0.06002329 -0.035967577 1.000000000 -0.007807136
## score -0.03045299 -0.038081568 -0.007807136 1.000000000